Here is my code:

Reformatting/Cleaning Up Data

Raw Counts File

cnts <- read.csv('raw_counts_Cooper_data_final.csv')

rownames(cnts) <- cnts$X

cnts$X <- NULL

cnts <- as.matrix(cnts)

This takes raw_counts file and corrects the columns and index of the dataframe, then converts the file from a data frame to a matrix.

Meta Data

column_data <- read.csv('Cooper_Data_Meta.csv')

rownames(column_data) <- column_data$Sample

column_data <- column_data[,c("Tissue","Treatment","Stage")]


#column_data$Tissue <- factor(column_data$Tissue)
#column_data$Treatment <- factor(column_data$Treatment)
#column_data$Stage <- factor(column_data$Stage)

#column_data$Treatment <- NULL
#column_data$Stage <- NULL
#column_data$Sample <- NULL

column_data
##                    Tissue Treatment         Stage
## SRR10912052    Serous_EOC   Control            4B
## SRR10912053    Serous_EOC  Platinum            4B
## SRR10912054    Serous_EOC   Control             4
## SRR10912055    Serous_EOC  Platinum             4
## SRR10912056    Serous_EOC   Control            3C
## SRR10912057    Serous_EOC  Platinum            3C
## SRR10912058    Serous_EOC   Control            3C
## SRR10912059    Serous_EOC  Platinum            3C
## SRR10912060    Serous_EOC   Control            3C
## SRR10912061    Serous_EOC  Platinum            3C
## SRR10912062    Serous_EOC   Control             4
## SRR10912063    Serous_EOC  Platinum             4
## SRR10912064    Serous_EOC   Control            3C
## SRR10912065    Serous_EOC  Platinum            3C
## SRR10912066    Serous_EOC   Control            3C
## SRR10912067    Serous_EOC  Platinum            3C
## SRR10912068    Serous_EOC   Control            3C
## SRR10912069    Serous_EOC  Platinum            3C
## SRR10912070    Serous_EOC   Control             4
## SRR10912071    Serous_EOC  Platinum             4
## SRR10912072    Serous_EOC   Control            3C
## SRR10912073    Serous_EOC   Control            3C
## SRR10912074    Serous_EOC  Platinum            3C
## SRR10912075    Serous_EOC   Control            3C
## SRR10912076    Serous_EOC  Platinum            3C
## SRR10912077    Serous_EOC  Platinum            3C
## SRR10912078    Serous_EOC   Control            3C
## SRR10912079    Serous_EOC  Platinum            3C
## SRR10912080    Serous_EOC   Control            3C
## SRR10912081    Serous_EOC  Platinum            3C
## SRR10912082    Serous_EOC   Control             4
## SRR10912083    Serous_EOC  Platinum             4
## SRR10912084    Serous_EOC   Control            3C
## SRR10912085    Serous_EOC  Platinum            3C
## SRR10912086    Serous_EOC   Control            3C
## SRR10912087    Serous_EOC  Platinum            3C
## SRR10912088    Serous_EOC   Control            3C
## SRR10912089    Serous_EOC   Control            3C
## SRR10912090    Serous_EOC   Control            3C
## SRR10912091    Serous_EOC   Control            3C
## SRR10912092    Serous_EOC   Control            3C
## SRR10912093    Serous_EOC   Control             4
## SRR10912094    Serous_EOC   Control            3C
## SRR10912095    Serous_EOC   Control            3C
## SRR10912096    Serous_EOC   Control            3C
## SRR10912097    Serous_EOC   Control            3C
## SRR10912098    Serous_EOC   Control            3C
## SRR10912099    Serous_EOC   Control            3C
## SRR10912100    Serous_EOC   Control            3C
## SRR10912101    Serous_EOC   Control            3C
## SRR10912102    Serous_EOC   Control            3C
## SRR10912103    Serous_EOC   Control            2C
## SRR10912104    Serous_EOC   Control            3C
## SRR10912105    Serous_EOC   Control             4
## SRR10912106    Serous_EOC   Control            3C
## SRR10912107    Serous_EOC   Control            3C
## SRR10912108    Serous_EOC   Control            3C
## SRR10912109    Serous_EOC   Control             4
## SRR10912110    Serous_EOC   Control            3C
## SRR10912111    Serous_EOC   Control            3C
## SRR10912112    Serous_EOC   Control            3C
## SRR10912113    Serous_EOC   Control            3C
## SRR10912114    Serous_EOC   Control            3C
## SRR10912115    Serous_EOC   Control             4
## SRR10912116    Serous_EOC   Control            3C
## SRR10912117    Serous_EOC   Control            3C
## SRR10912118    Serous_EOC   Control            3C
## SRR10912119    Serous_EOC   Control            3C
## SRR10912120    Serous_EOC   Control             4
## SRR10912121    Serous_EOC   Control            3C
## SRR10912122    Serous_EOC   Control            3C
## SRR10912123    Serous_EOC   Control            3C
## SRR10912124    Serous_EOC   Control             4
## SRR10912125    Serous_EOC   Control            3C
## SRR10912126    Serous_EOC   Control             4
## SRR10912127 Benign_Tissue   Control patientid: 61
## SRR10912128 Benign_Tissue   Control patientid: 62
## SRR10912129 Benign_Tissue   Control patientid: 63
## SRR10912130 Benign_Tissue   Control patientid: 64
## SRR10912131 Benign_Tissue   Control patientid: 65
## SRR10912132 Benign_Tissue   Control patientid: 66
## SRR10912133 Benign_Tissue   Control patientid: 67
## SRR10912134 Benign_Tissue   Control patientid: 68
## SRR10912135 Benign_Tissue   Control patientid: 69
## SRR10912136 Benign_Tissue   Control patientid: 70
## SRR10912137 Benign_Tissue   Control patientid: 71
## SRR10912138       Ascites   Control            4A
## SRR10912139       Ascites   Control            3C
## SRR10912140       Ascites   Control            3C
## SRR10912141       Ascites   Control            3C
## SRR10912142    Serous_EOC   Control            4A
## SRR10912143    Serous_EOC   Control            3C
## SRR10912144 Benign_Tissue   Control patientid: 78
## SRR10912145    Serous_EOC   Control            3C
## SRR10912146    Serous_EOC   Control            3C
## SRR10912147       Ascites   Control            3C
## SRR10912148       Ascites   Control            3C
## SRR10912149       Ascites   Control            3C
## SRR10912150       Ascites   Control            3C
## SRR10912151       Ascites   Control            3C
## SRR10912152       Ascites   Control             4
## SRR10912153       Ascites   Control            3C
## SRR10912154       Ascites   Control            3C
## SRR10912155       Ascites   Control            3C
## SRR10912156       Ascites   Control            3C
## SRR10912157       Ascites   Control            3C
## SRR10912158       Ascites   Control            3C
## SRR10912159       Ascites   Control            3C
## SRR10912160       Ascites   Control            3C
## SRR10912161       Ascites   Control            2C
## SRR10912162       Ascites   Control            3C
## SRR10912163       Ascites   Control             4
## SRR10912164       Ascites   Control            3C
## SRR10912165       Ascites   Control            3C
## SRR10912166       Ascites   Control            3C
## SRR10912167       Ascites   Control            3C
## SRR10912168       Ascites   Control             4
## SRR10912169       Ascites   Control            3C
## SRR10912170       Ascites   Control            3C
## SRR10912171       Ascites   Control            3C
## SRR10912172       Ascites   Control             4

This takes the metadata, or column data, file and corrects the columns and index of the dataframe, then converts the file from a data frame to a matrix.

Check if rownames and columnnames Match

all(rownames(column_data) %in% colnames(cnts))
## [1] TRUE
all(rownames(column_data) == colnames(cnts))
## [1] TRUE

This checks if the rownames and column names match.

DeSeq2

library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
dds <- DESeqDataSetFromMatrix(countData = cnts,
                              colData = column_data,
                              design = ~ Tissue)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds
## class: DESeqDataSet 
## dim: 20643 121 
## metadata(1): version
## assays(1): counts
## rownames(20643): ENSG00000000003 ENSG00000000005 ... ENSG00000273439
##   ENSG00000273452
## rowData names(0):
## colnames(121): SRR10912052 SRR10912053 ... SRR10912171 SRR10912172
## colData names(3): Tissue Treatment Stage

Pre-filtering

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

Prefiltering can improve visualization as features with no information are not plotted.

Factor Levels

dds$Tissue <- relevel(dds$Tissue, ref = "Benign_Tissue")

“By default, R will choose a reference level for factors based on alphabetical order. Then, if you never tell the DESeq2 functions which level you want to compare against (e.g. which level represents the control group), the comparisons will be based on the alphabetical order of the levels. There are two solutions: you can either explicitly tell results which comparison to make using the contrast argument (this will be shown later), or you can explicitly set the factors levels. In order to see the change of reference levels reflected in the results names, you need to either run DESeq or nbinomWaldTest/nbinomLRT after the re-leveling operation.”

Setting Benign Tissue as the reference level.

Differential expression analysis

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1168 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
resultsNames(dds)
## [1] "Intercept"                          "Tissue_Ascites_vs_Benign_Tissue"   
## [3] "Tissue_Serous_EOC_vs_Benign_Tissue"
#summary(res)
res_Serous_EOC <- results(dds, name="Tissue_Serous_EOC_vs_Benign_Tissue")

res_Serous_EOC
## log2 fold change (MLE): Tissue Serous EOC vs Benign Tissue 
## Wald test p-value: Tissue Serous EOC vs Benign Tissue 
## DataFrame with 19975 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE      stat      pvalue
##                  <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003  1359.4263      -0.790170  0.248072 -3.185252 1.44628e-03
## ENSG00000000005    28.6264      -0.512972  0.913941 -0.561275 5.74610e-01
## ENSG00000000419   945.1298       0.408815  0.156660  2.609561 9.06584e-03
## ENSG00000000457   339.0323      -0.139602  0.124784 -1.118752 2.63246e-01
## ENSG00000000460   189.3951       1.346737  0.231249  5.823762 5.75375e-09
## ...                    ...            ...       ...       ...         ...
## ENSG00000273294 18.6100019       0.115738  0.774778 0.1493822    0.881252
## ENSG00000273331  1.6135503       2.457218  1.142924 2.1499398    0.031560
## ENSG00000273398  5.7510137       0.562653  0.536606 1.0485403    0.294390
## ENSG00000273439 22.4666648       0.669630  0.415259 1.6125583    0.106841
## ENSG00000273452  0.0657423       0.197680  4.954781 0.0398967    0.968175
##                        padj
##                   <numeric>
## ENSG00000000003 3.91623e-03
## ENSG00000000005 6.64751e-01
## ENSG00000000419 1.99003e-02
## ENSG00000000457 3.58693e-01
## ENSG00000000460 5.14674e-08
## ...                     ...
## ENSG00000273294   0.9141591
## ENSG00000273331   0.0593248
## ENSG00000273398   0.3925197
## ENSG00000273439   0.1682453
## ENSG00000273452          NA
resLFC_Serous_EOC <- lfcShrink(dds, coef="Tissue_Serous_EOC_vs_Benign_Tissue", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_Serous_EOC
## log2 fold change (MAP): Tissue Serous EOC vs Benign Tissue 
## Wald test p-value: Tissue Serous EOC vs Benign Tissue 
## DataFrame with 19975 rows and 5 columns
##                   baseMean log2FoldChange     lfcSE      pvalue        padj
##                  <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000000003  1359.4263      -0.707614  0.238990 1.44628e-03 3.91623e-03
## ENSG00000000005    28.6264      -0.256894  0.633896 5.74610e-01 6.64751e-01
## ENSG00000000419   945.1298       0.397866  0.155273 9.06584e-03 1.99003e-02
## ENSG00000000457   339.0323      -0.136250  0.123498 2.63246e-01 3.58693e-01
## ENSG00000000460   189.3951       1.301129  0.234105 5.75375e-09 5.14674e-08
## ...                    ...            ...       ...         ...         ...
## ENSG00000273294 18.6100019      0.0682127  0.587723    0.881252   0.9141591
## ENSG00000273331  1.6135503      3.6905904  1.538611    0.031560   0.0593248
## ENSG00000273398  5.7510137      0.4212642  0.487248    0.294390   0.3925197
## ENSG00000273439 22.4666648      0.5644270  0.401099    0.106841   0.1682453
## ENSG00000273452  0.0657423      0.0337805  0.882672    0.968175          NA
res_Ascites <- results(dds, name="Tissue_Ascites_vs_Benign_Tissue")

res_Ascites
## log2 fold change (MLE): Tissue Ascites vs Benign Tissue 
## Wald test p-value: Tissue Ascites vs Benign Tissue 
## DataFrame with 19975 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE      stat      pvalue
##                  <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003  1359.4263      -0.914937  0.275916  -3.31600 9.13167e-04
## ENSG00000000005    28.6264      -4.565551  1.029674  -4.43398 9.25110e-06
## ENSG00000000419   945.1298       0.487980  0.175152   2.78604 5.33562e-03
## ENSG00000000457   339.0323      -0.448853  0.141450  -3.17321 1.50761e-03
## ENSG00000000460   189.3951       1.099443  0.259719   4.23321 2.30384e-05
## ...                    ...            ...       ...       ...         ...
## ENSG00000273294 18.6100019      -0.553297  0.866688 -0.638404   0.5232107
## ENSG00000273331  1.6135503       2.834898  1.244909  2.277193   0.0227747
## ENSG00000273398  5.7510137      -0.419677  0.607245 -0.691117   0.4894919
## ENSG00000273439 22.4666648      -0.072854  0.470010 -0.155005   0.8768173
## ENSG00000273452  0.0657423       2.226346  5.458155  0.407893   0.6833519
##                        padj
##                   <numeric>
## ENSG00000000003 2.17650e-03
## ENSG00000000005 3.16468e-05
## ENSG00000000419 1.08888e-02
## ENSG00000000457 3.44937e-03
## ENSG00000000460 7.32642e-05
## ...                     ...
## ENSG00000273294   0.6021114
## ENSG00000273331   0.0401088
## ENSG00000273398   0.5705074
## ENSG00000273439   0.9009092
## ENSG00000273452   0.7426654
resLFC_Ascites <- lfcShrink(dds, coef="Tissue_Ascites_vs_Benign_Tissue", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_Ascites
## log2 fold change (MAP): Tissue Ascites vs Benign Tissue 
## Wald test p-value: Tissue Ascites vs Benign Tissue 
## DataFrame with 19975 rows and 5 columns
##                   baseMean log2FoldChange     lfcSE      pvalue        padj
##                  <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000000003  1359.4263      -0.837964  0.267011 9.13167e-04 2.17650e-03
## ENSG00000000005    28.6264      -0.844550  0.833044 9.25110e-06 3.16468e-05
## ENSG00000000419   945.1298       0.475800  0.173115 5.33562e-03 1.08888e-02
## ENSG00000000457   339.0323      -0.441299  0.139981 1.50761e-03 3.44937e-03
## ENSG00000000460   189.3951       1.050195  0.259593 2.30384e-05 7.32642e-05
## ...                    ...            ...       ...         ...         ...
## ENSG00000273294 18.6100019     -0.4558388  0.684649   0.5232107   0.6021114
## ENSG00000273331  1.6135503      0.8943902  1.171554   0.0227747   0.0401088
## ENSG00000273398  5.7510137     -0.5741662  0.558885   0.4894919   0.5705074
## ENSG00000273439 22.4666648     -0.1303708  0.426959   0.8768173   0.9009092
## ENSG00000273452  0.0657423      0.0804353  1.022513   0.6833519   0.7426654
resLFCOrdered_Serous_EOC <- resLFC_Serous_EOC[order(resLFC_Serous_EOC$pvalue),]

resLFCOrdered_Serous_EOC
## log2 fold change (MAP): Tissue Serous EOC vs Benign Tissue 
## Wald test p-value: Tissue Serous EOC vs Benign Tissue 
## DataFrame with 19975 rows and 5 columns
##                  baseMean log2FoldChange     lfcSE      pvalue        padj
##                 <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000100380 5724.9650       -2.36653  0.160123 6.74717e-50 9.08323e-46
## ENSG00000154545   61.4084       11.23218  3.529445 9.46367e-50 9.08323e-46
## ENSG00000100227 1539.3231       -1.92776  0.143622 2.46081e-41 1.57459e-37
## ENSG00000139734  173.2278        4.27102  0.321202 9.45548e-41 4.53768e-37
## ENSG00000069966  566.3977       -2.15348  0.163485 4.28627e-40 1.64559e-36
## ...                   ...            ...       ...         ...         ...
## ENSG00000184007   3630.87    0.000342523  0.127981    0.999963    0.999963
## ENSG00000154537      0.00    0.069290955  0.887184    1.000000          NA
## ENSG00000242366      0.00    0.061543666  0.885882    1.000000          NA
## ENSG00000255863      0.00    0.071625991  0.887520    1.000000          NA
## ENSG00000268485      0.00    0.068737657  0.887100    1.000000          NA
resLFCOrdered_Ascites <- resLFC_Ascites[order(resLFC_Ascites$pvalue),]

resLFCOrdered_Ascites
## log2 fold change (MAP): Tissue Ascites vs Benign Tissue 
## Wald test p-value: Tissue Ascites vs Benign Tissue 
## DataFrame with 19975 rows and 5 columns
##                  baseMean log2FoldChange     lfcSE      pvalue        padj
##                 <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000115461 16817.080       -6.76674  0.420994 4.15654e-58 8.30102e-54
## ENSG00000100380  5724.965       -2.74955  0.178141 2.86234e-54 2.85819e-50
## ENSG00000072840   769.001       -3.36303  0.223246 7.75342e-52 5.16145e-48
## ENSG00000100227  1539.323       -2.41606  0.161322 2.93616e-51 1.46595e-47
## ENSG00000113658  1041.526       -2.28680  0.158517 1.05393e-47 4.20962e-44
## ...                   ...            ...       ...         ...         ...
## ENSG00000147596   3.33851     -0.1884468  0.527538    0.999197    0.999197
## ENSG00000154537   0.00000     -0.0302969  1.017772    1.000000          NA
## ENSG00000242366   0.00000     -0.0277997  1.017821    1.000000          NA
## ENSG00000255863   0.00000     -0.0311519  1.017759    1.000000          NA
## ENSG00000268485   0.00000     -0.0301904  1.017780    1.000000          NA
summary(resLFCOrdered_Serous_EOC)
## 
## out of 19971 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 6618, 33%
## LFC < 0 (down)     : 4518, 23%
## outliers [1]       : 0, 0%
## low counts [2]     : 779, 3.9%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(resLFCOrdered_Ascites)
## 
## out of 19971 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 6636, 33%
## LFC < 0 (down)     : 6123, 31%
## outliers [1]       : 0, 0%
## low counts [2]     : 4, 0.02%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
plotMA(resLFCOrdered_Serous_EOC, ylim=c(-2,2))

idx_LFCOrdered_Serous_EOC <- identify(resLFCOrdered_Serous_EOC$baseMean, resLFCOrdered_Serous_EOC$log2FoldChange)

rownames(resLFCOrdered_Serous_EOC)[idx_LFCOrdered_Serous_EOC]
## character(0)
plotMA(resLFCOrdered_Ascites, ylim=c(-2,2))

plotMA(resLFCOrdered_Serous_EOC, ylim=c(-2,2))

idx_LFCOrdered_Serous_EOC <- identify(resLFCOrdered_Serous_EOC$baseMean, resLFCOrdered_Serous_EOC$log2FoldChange)

rownames(resLFCOrdered_Serous_EOC)[idx_LFCOrdered_Serous_EOC]
## character(0)
resNorm <- lfcShrink(dds, coef=3, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=3, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC <- lfcShrink(dds, coef=3, type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.min(resLFC_Serous_EOC$padj), intgroup="Tissue")

plotCounts(dds, gene=which.min(resLFC_Ascites$padj), intgroup="Tissue")

vsd <- vst(dds, blind=FALSE)
#rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds)
library("vsn")
meanSdPlot(assay(ntd))

meanSdPlot(assay(vsd))

#meanSdPlot(assay(rld))
library("pheatmap")
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("Tissue","Treatment")])
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=df)

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=df)

#pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE,
#         cluster_cols=FALSE, annotation_col=df)
sampleDists <- dist(t(assay(vsd)))

library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

plotPCA(vsd, intgroup=c("Tissue"))

results(dds, contrast=c("Tissue","Serous_EOC","Benign_Tissue"))
## log2 fold change (MLE): Tissue Serous_EOC vs Benign_Tissue 
## Wald test p-value: Tissue Serous EOC vs Benign Tissue 
## DataFrame with 19975 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE      stat      pvalue
##                  <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSG00000000003  1359.4263      -0.790170  0.248072 -3.185252 1.44628e-03
## ENSG00000000005    28.6264      -0.512972  0.913941 -0.561275 5.74610e-01
## ENSG00000000419   945.1298       0.408815  0.156660  2.609561 9.06584e-03
## ENSG00000000457   339.0323      -0.139602  0.124784 -1.118752 2.63246e-01
## ENSG00000000460   189.3951       1.346737  0.231249  5.823762 5.75375e-09
## ...                    ...            ...       ...       ...         ...
## ENSG00000273294 18.6100019       0.115738  0.774778 0.1493822    0.881252
## ENSG00000273331  1.6135503       2.457218  1.142924 2.1499398    0.031560
## ENSG00000273398  5.7510137       0.562653  0.536606 1.0485403    0.294390
## ENSG00000273439 22.4666648       0.669630  0.415259 1.6125583    0.106841
## ENSG00000273452  0.0657423       0.197680  4.954781 0.0398967    0.968175
##                        padj
##                   <numeric>
## ENSG00000000003 3.91623e-03
## ENSG00000000005 6.64751e-01
## ENSG00000000419 1.99003e-02
## ENSG00000000457 3.58693e-01
## ENSG00000000460 5.14674e-08
## ...                     ...
## ENSG00000273294   0.9141591
## ENSG00000273331   0.0593248
## ENSG00000273398   0.3925197
## ENSG00000273439   0.1682453
## ENSG00000273452          NA
resApeT <- lfcShrink(dds, coef=2, type="apeglm", lfcThreshold=1)
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## computing FSOS 'false sign or small' s-values (T=1)
plotMA(resApeT, ylim=c(-3,3), cex=.8)
## thresholding s-values on alpha=0.005 to color points
abline(h=c(-1,1), col="dodgerblue", lwd=2)

resAshT <- lfcShrink(dds, coef=2, type="ashr", lfcThreshold=1)
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## computing FSOS 'false sign or small' s-values (T=1)
plotMA(resAshT, ylim=c(-3,3), cex=.8)
## thresholding s-values on alpha=0.005 to color points
abline(h=c(-1,1), col="dodgerblue", lwd=2)

par(mar=c(8,5,2,2))
boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)

plotDispEsts(dds)

metadata(resLFC_Serous_EOC)$alpha
## [1] 0.1
metadata(resLFC_Serous_EOC)$filterThreshold
## 3.896759% 
## 0.2997622
plot(metadata(resLFC_Serous_EOC)$filterNumRej, 
     type="b", ylab="number of rejections",
     xlab="quantiles of filter")
lines(metadata(resLFC_Serous_EOC)$lo.fit, col="red")
abline(v=metadata(resLFC_Serous_EOC)$filterTheta)

resNoFilt <- results(dds, independentFiltering=FALSE)
addmargins(table(filtering=(resLFC_Serous_EOC$padj < .1),
                 noFiltering=(resNoFilt$padj < .1)))
##          noFiltering
## filtering FALSE  TRUE   Sum
##     FALSE  8060     0  8060
##     TRUE     84 11052 11136
##     Sum    8144 11052 19196
metadata(resLFC_Ascites)$alpha
## [1] 0.1
metadata(resLFC_Ascites)$filterThreshold
## 0.02002503% 
## 0.005648919
plot(metadata(resLFC_Ascites)$filterNumRej, 
     type="b", ylab="number of rejections",
     xlab="quantiles of filter")
lines(metadata(resLFC_Ascites)$lo.fit, col="red")
abline(v=metadata(resLFC_Ascites)$filterTheta)

resNoFilt <- results(dds, independentFiltering=FALSE)
addmargins(table(filtering=(resLFC_Ascites$padj < .1),
                 noFiltering=(resNoFilt$padj < .1)))
##          noFiltering
## filtering FALSE  TRUE   Sum
##     FALSE  5118  2094  7212
##     TRUE   3801  8958 12759
##     Sum    8919 11052 19971
par(mfrow=c(2,2),mar=c(2,2,1,1))
ylim <- c(-2.5,2.5)
resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs")
resLA <- results(dds, lfcThreshold=.5, altHypothesis="lessAbs")
resG <- results(dds, lfcThreshold=.5, altHypothesis="greater")
resL <- results(dds, lfcThreshold=.5, altHypothesis="less")
drawLines <- function() abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()

#mcols(dds,use.names=TRUE)[1:4,1:4]

#substr(names(mcols(dds)),1,10) 

#mcols(mcols(dds), use.names=TRUE)[1:4,]

#head(assays(dds)[["mu"]])

#head(assays(dds)[["cooks"]])

#head(dispersions(dds))

#head(mcols(dds)$dispersion)

#sizeFactors(dds)

#head(coef(dds))

#attr(dds, "betaPriorVar")

#priorInfo(resLFC)